This notebook cross-validates the CLTK's part-of-speech taggers for Ancient Greek: 10-fold cross-validation of five NLTK tagger types trained on the Perseus Greek treebank training set. The final results are at the bottom.


In [1]:
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tag import BigramTagger
from nltk.tag import tnt
from nltk.tag import TrigramTagger
from nltk.tag import UnigramTagger
import math
import os
import pandas as pd
import random
from statistics import mean
from statistics import stdev
import time

In [2]:
full_training_set_rel = '~/greek_treebank_perseus/greek_training_set.pos'
full_training_set = os.path.expanduser(full_training_set_rel)
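For reference, the training file is in NLTK's default TaggedCorpusReader layout: whitespace-separated word/TAG tokens, with blank lines between sentences (which is why the next cell splits on '\n\n'). A hypothetical two-token sentence, assuming the Perseus treebank's nine-character positional tags:

    μῆνιν/n-s---fa- ἄειδε/v2spma---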

In [3]:
# Note: this cell's code is correct but slow (roughly 8-12 minutes per fold;
# see the runtimes below), so it can time out in IPython. Consider running
# it as a standalone script.
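# NB: `tagger.evaluate()` as used below is deprecated in newer NLTK
# releases in favor of the equivalent `tagger.accuracy()`.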
unigram_accuracies = []
bigram_accuracies = []
trigram_accuracies = []
backoff_accuracies = []
tnt_accuracies = []

with open(full_training_set) as f:
    training_set_string = f.read()

pos_set = training_set_string.split('\n\n')  # one string per tagged sentence

sentence_count = len(pos_set)  # 24825 
tenth = math.ceil(sentence_count / 10)

random.shuffle(pos_set)

def chunks(l, n):
    """Yield successive n-sized chunks from l.
    http://stackoverflow.com/a/312464
    """
    for i in range(0, len(l), n):
        yield l[i:i+n]
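
# e.g. list(chunks(list(range(5)), 2)) == [[0, 1], [2, 3], [4]]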

ten_parts = list(chunks(pos_set, tenth))  # a list of 10 lists with ~2483 sentences each

for counter, part in enumerate(ten_parts):
    
    start = time.time()
    
    # this fold is the test set
    test_set = part

    # the remaining nine folds form the training set
    training_set_lists = [x for x in ten_parts if x is not part]
    
    # flatten the nine training folds into one list ( http://stackoverflow.com/a/952952 )
    training_set = [item for sublist in training_set_lists for item in sublist]
        
    # write this fold's train/test split to disk for NLTK's corpus readers
    local_dir_rel = '~/cltk_data/user_data'
    local_dir = os.path.expanduser(local_dir_rel)
    if not os.path.isdir(local_dir):
        os.makedirs(local_dir)

    test_path = os.path.join(local_dir, 'test_greek.pos')
    with open(test_path, 'w') as f:
        f.write('\n\n'.join(test_set))

    train_path = os.path.join(local_dir, 'train_greek.pos')
    with open(train_path, 'w') as f:
        f.write('\n\n'.join(training_set))

    # read POS corpora
    train_reader = TaggedCorpusReader(local_dir, 'train_greek.pos')
    train_sents = train_reader.tagged_sents()

    test_reader = TaggedCorpusReader(local_dir, 'test_greek.pos')
    test_sents = test_reader.tagged_sents()
    
    print('Loop #' + str(counter))
    # make unigram tagger
    unigram_tagger = UnigramTagger(train_sents)
    # evaluate unigram tagger
    unigram_accuracy = unigram_tagger.evaluate(test_sents)
    unigram_accuracies.append(unigram_accuracy)
    print('Unigram:', unigram_accuracy)
    
    # make bigram tagger
    bigram_tagger = BigramTagger(train_sents)
    # evaluate bigram tagger
    bigram_accuracy = bigram_tagger.evaluate(test_sents)
    bigram_accuracies.append(bigram_accuracy)
    print('Bigram:', bigram_accuracy)
    
    # make trigram tagger
    trigram_tagger = TrigramTagger(train_sents)
    # evaluate trigram tagger
    trigram_accuracy = trigram_tagger.evaluate(test_sents)
    trigram_accuracies.append(trigram_accuracy)
    print('Trigram:', trigram_accuracy)
    
    # make 1, 2, 3-gram backoff tagger
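    # each tagger falls back to the next lower-order model for contexts it
    # has never seen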
    tagger1 = UnigramTagger(train_sents)
    tagger2 = BigramTagger(train_sents, backoff=tagger1)
    tagger3 = TrigramTagger(train_sents, backoff=tagger2)
    # evaluate backoff tagger
    backoff_accuracy = tagger3.evaluate(test_sents)
    backoff_accuracies.append(backoff_accuracy)
    print('1, 2, 3-gram backoff:', backoff_accuracy)
    
    # make tnt tagger
    tnt_tagger = tnt.TnT(N=100)  # beam-search width; the default N=1000 won't finish on this Greek set
    tnt_tagger.train(train_sents)
    # evaluate tnt tagger
    tnt_accuracy = tnt_tagger.evaluate(test_sents)
    tnt_accuracies.append(tnt_accuracy)
    print('TnT:', tnt_accuracy)
    
    print('Runtime:', time.time() - start)
    print()


Loop #0
Unigram: 0.8170218852810757
Bigram: 0.2537548276355314
Trigram: 0.1938492347303676
1, 2, 3-gram backoff: 0.820569303390073
TnT: 0.8299241882420254
Runtime: 631.0014109611511

Loop #1
Unigram: 0.8126170079990923
Bigram: 0.25401372893856017
Trigram: 0.19186475293583707
1, 2, 3-gram backoff: 0.8170987689340217
TnT: 0.8242752595450161
Runtime: 592.7357261180878

Loop #2
Unigram: 0.8172981269363044
Bigram: 0.2615183469289145
Trigram: 0.20057982548389847
1, 2, 3-gram backoff: 0.8219310462439247
TnT: 0.8330727908364836
Runtime: 705.8858499526978

Loop #3
Unigram: 0.8176651870281886
Bigram: 0.2541773869772965
Trigram: 0.19336141819107128
1, 2, 3-gram backoff: 0.8211710820209788
TnT: 0.8313494868387571
Runtime: 711.4081230163574

Loop #4
Unigram: 0.8136574721647177
Bigram: 0.2548816650814077
Trigram: 0.19680523694958876
1, 2, 3-gram backoff: 0.8151121803838192
TnT: 0.8273932747720024
Runtime: 702.5740129947662

Loop #5
Unigram: 0.812089356110381
Bigram: 0.2519710906701708
Trigram: 0.18851292159439334
1, 2, 3-gram backoff: 0.8137593079281646
TnT: 0.8251752080595708
Runtime: 545.2309489250183

Loop #6
Unigram: 0.8137136093340472
Bigram: 0.25987092410450074
Trigram: 0.2011949384212158
1, 2, 3-gram backoff: 0.8169264154665614
TnT: 0.827494856691937
Runtime: 540.8957121372223

Loop #7
Unigram: 0.8149580548841178
Bigram: 0.2555950518981942
Trigram: 0.1952509597611261
1, 2, 3-gram backoff: 0.8230058296601734
TnT: 0.8286933030001422
Runtime: 513.0061490535736

Loop #8
Unigram: 0.8165790287658522
Bigram: 0.252228439670444
Trigram: 0.19494980738407897
1, 2, 3-gram backoff: 0.8170570536793859
TnT: 0.8296544161066277
Runtime: 492.7715480327606

Loop #9
Unigram: 0.8166695006518165
Bigram: 0.2617185285949102
Trigram: 0.201241285495664
1, 2, 3-gram backoff: 0.8200986226832171
TnT: 0.829790851895936
Runtime: 556.2160589694977


In [4]:
final_accuracies_list = []

# summarize each tagger's ten folds as mean accuracy and standard deviation
# (n.b.: avoid naming a variable `tnt` here, which would shadow the
# nltk.tag.tnt module imported above)
for name, accuracies in [('unigram', unigram_accuracies),
                         ('bigram', bigram_accuracies),
                         ('trigram', trigram_accuracies),
                         ('1, 2, 3-gram backoff', backoff_accuracies),
                         ('tnt', tnt_accuracies)]:
    final_accuracies_list.append({name: {'mean': mean(accuracies),
                                         'sd': stdev(accuracies)}})

In [7]:
final_dict = {}
for x in final_accuracies_list:
    final_dict.update(x)

df = pd.DataFrame(final_dict)
df


Out[7]:
      1, 2, 3-gram backoff    bigram       tnt   trigram   unigram
mean              0.818673  0.255973  0.828682  0.195761  0.815227
sd                0.003095  0.003686  0.002685  0.004242  0.002078
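
The 1, 2, 3-gram backoff chain and TnT come out on top (mean accuracy ≈ 0.82-0.83), with the unigram tagger close behind; the standalone bigram and trigram taggers fare poorly because, without a backoff, they leave every unseen context untagged. To put the winning backoff chain to use, one could train it on the full training set and pickle it for later tagging work; a minimal sketch reusing the variables defined above (the output filename is hypothetical):

    import pickle

    # train the 1, 2, 3-gram backoff chain on the full Perseus training set
    full_reader = TaggedCorpusReader(os.path.dirname(full_training_set),
                                     'greek_training_set.pos')
    all_sents = full_reader.tagged_sents()

    tagger1 = UnigramTagger(all_sents)
    tagger2 = BigramTagger(all_sents, backoff=tagger1)
    tagger3 = TrigramTagger(all_sents, backoff=tagger2)

    # hypothetical output path
    pickle_path = os.path.expanduser('~/cltk_data/user_data/greek_123_backoff.pickle')
    with open(pickle_path, 'wb') as f:
        pickle.dump(tagger3, f)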